******************************************************************************************************************
* PREDICTED DISABILITY SCALE
******************************************************************************************************************

/* SET-UP
		All of these model syntaxes have a section at the start that sets the details of the analyses that follow - the idea is that it is easier to change 
		things once, than to have to do find-and-replace throughout a series of syntax files.
		We have filled these in with the variables that we used - but you will need to change this to the variables in your data.
		Obviously you also need to load your data here!
*/
// Core settings - these are the same for all the example syntax files
// NOTE(review): the global "user" that outputdir starts with is not defined anywhere in this file - confirm it is set before this point
global empvar				"rworknew"				// the binary employment variable (1=working, 0=non-working)
global countryvar			"country"				// the categorical variable for country, with each country denoted by a value
global disvar				"llsiH"					// the binary self-reported general disability variable (1 for has a disability, 0 for does not)
global pweight				"rwtresp2"				// the probability weight used for this survey
global controls				"i.ragey i.rmale" 		// control variables
global outputdir 			"${user}\OneDrive - King's College London\Disability work\ESRC Future Leaders Disability\Phase 1 (Dis Emp Rates) - Intl\ELSA-SHARE-HRS\Outputs"	// the file location to save the output tables to
/* Setting the analysis sample (needs to be done from the outset) 
   - these are the ages and countries used in our analysis - you should set this to whatever you are using.
   - the country variable is referred to through the countryvar global, so that it only has to be changed in one place.
   NOTE(review): country 29 is in the 2015 list but not in the list of countries excluded from the 2013 clause, and countries 51 
   and 61 are not excluded from the 2013 clause either, so any 2013 rows for those countries are kept too - confirm this is intended. */
keep if (ragey>=50 & ragey<70) 
keep if (year==2015 & inlist(${countryvar},19,23,29,34,35)) | ( (year==2013 & !inlist(${countryvar},19,23,34,35)) | (${countryvar}==61 & year==2014) | (${countryvar}==51 & year==2010) )	
// Cases are dropped if employment, age or sex is missing; a missing disability variable only leads to a drop outside countries 51 and 61
drop if missing(${empvar}, ragey, rmale) | (missing(${disvar}) & !inlist(${countryvar},51,61) )
// Final things
svyset [pw=${pweight}], strata(${countryvar})		// for some commands, it's easier to use the svy prefix than to set weights with [pweight=${pweight}]
// These are the extra globals for the predicted disability scale
	global indicators			"rwalk100a rsita rchaira rstoopa rarmsa rpusha rlifta rdimea i.rclimT anyADL i.anyIADLt rmhcase i.visionT i.hearingT"	// this is the list of functional limitation/symptom variables used in the predicted disability scale
	// These both include the control variables for the disability scale
	global disweight_controls	"rmale ${empvar}"				// binary or continuous variables
	global disweight_factors 	"i.ragey i.country i.risced3"	// These are the variables where dummy variables need to be created
	// Whether to have a threshold just below or just above the observed prevalence of self-reported disability (see below/Appendices)
	// (this letter, L or H, is the suffix of the binary variable that is kept further down)
	global whichversion			"H"

	
*__________________________________________________________________________________________________
* 
**# CREATING THE PREDICTED DISABILITY SCALE
*__________________________________________________________________________________________________

// Listwise deletion: only cases with complete data on the controls, the employment variable and every indicator are kept
* rowmiss() is given plain variable names, so the factor-variable notation ("i.") is first stripped out of the combined list
local strippedvars "${controls} ${empvar} ${indicators}"
local strippedvars : subinstr local strippedvars "i." "", all
egen misscount = rowmiss(`strippedvars')
	keep if misscount==0					
	drop misscount
* Rescaling the weights within the remaining subsample, country by country, so that they average 1 in each country
levelsof ${countryvar}, local(countrylevels)
foreach c of local countrylevels {
	* r(mean) from this summarize is the current mean weight in country `c' - each weight is divided by it
	summarize ${pweight} if ${countryvar}==`c'
	replace ${pweight} = ${pweight} /`r(mean)' if ${countryvar}==`c' 
}

	
// Building two companion variables for every control used in the disability-scale model:
//   <var>_pred - a copy of the control, used as it is when the model is estimated and overwritten later on for the prediction
//   <var>_mean - the survey-weighted mean of that copy, stored as a constant
		/*  NOTE: setting binary variables to their mean is slightly odd, but works well for ensuring that predicted disability and observed 
			disability have the same prevalence.  This is also done for country, to ensure that people with the same functional impairments have 
			the same predicted probability of disability whichever country they are in (but we control for country to remove spurious associations). */
* Step 1: dummy variables for the categorical controls (those with 3 or more levels), created with the prefix I
	xi ${disweight_factors}, prefix(I)
	* NOTE(review): the I* wildcard picks up every variable whose name starts with a capital I, not only the new dummies
	unab disweight_controls_expanded: ${disweight_controls} I* 
* Step 2: looping over the expanded list to create the copies and to collect the names of the _pred versions
	global disweight_controls2 ""			// starts empty; one name is appended per pass through the loop
	foreach v of local disweight_controls_expanded {
		global disweight_controls2 "${disweight_controls2} `v'_pred" 	// the list of regressors for the model below
		generate `v'_pred = `v'
		quietly svy: mean `v'_pred  
			matrix working = e(b)
		generate `v'_mean = el(working, 1, 1)
		label variable `v'_pred	"`v' copy for prediction (actual value for model, mean for prediction)"
		label variable `v'_mean "`v' mean value in full sample"
	}
	

// Estimating the regression-based weights: self-reported disability on the indicators plus the _pred copies of the controls
eststo ${disvar}:		logit ${disvar} ${indicators} 		${disweight_controls2} [pweight=${pweight}]
	* Every control copy is now overwritten with its mean, so that predictions only vary with the functional limitations
	foreach v of local disweight_controls_expanded {
		replace `v'_pred = `v'_mean 
	}
	* The prediction itself (the probability of disability)
	predict p_predicted, pr
		label variable p_predicted "PREDICTED VALUES of ${disvar} from regression weights"
	* Adjustment so that the mean of the prediction equals the mean of the observed disability variable
	* (the two means are very close, if not quite identical, before this step - both are shown by the command below)
	svy: mean ${disvar} p_predicted if !missing(${disvar}, p_predicted)
		local adjustment = _b[${disvar}] / _b[p_predicted]
		replace p_predicted = p_predicted*`adjustment'
	* Final tidying: the helper variables are no longer needed
	drop *_pred *_mean 
	capture drop I*
	

// Writing the disability weights (the stored logit results) to a csv file, leaving out the control copies
esttab ${disvar} using "${outputdir}\2_dis_weights_predicted_${disvar}.csv", ///
	csv replace not p brackets nostar nonum nodepvars b(%4.3f) wide ///
	aic(%6.0fc) bic(%6.0fc) drop(*_pred) 


*_____________________________________________________________________________________________________________________________________________________
*
**# TURNING PROBABILITIES/SCALES INTO DISABILITY VARS
*_____________________________________________________________________________________________________________________________________________________

//	NOTE: The svy: tab command is extremely slow, so there are various display flags so that you know where it crashes (if it crashes)


// 	Working out the percentile that gives the same % with a disability as general self-reported disability
* The weighted proportions of the self-reported disability variable are estimated first
	proportion ${disvar} [pweight=${pweight}]
	matrix output = e(b)
* prop_dis is 100 minus the second estimated proportion expressed as a percentage
* (presumably the share reporting a disability, given a 0/1 variable - it is used as the percentile for the cut-off further down)
	global prop_dis			= 100 - (100*el(output, 1, 2))


// Finding the cut-offs on the predicted disability scale 
/*			Note that in nearly all cases, there is no exact cut-off to match the observed prevalence of disability.
			Instead, we have a choice between a cut-off just below this threshold, or one just above it.
			It makes sense to choose the one that is closest, but this is best done by hand.
			Two binary variables are created here (suffixes fxdL and fxdH); the choice between them is made further down.		*/
	// Ensure that varying precision in globals doesn't cause rounding problems (in Stata, see 'help precision')
	// NOTE(review): the rounding is applied before the variable is recast to double - confirm that this order is intended
		replace p_predicted = round(p_predicted, 0.000001)
		recast double p_predicted 					
	// Find the cutpoint on the latent-variable score that produces the same level of disability
		* Binary var with threshold just BELOW observed disability
		* (the cut-off is the weighted percentile of p_predicted given by the prop_dis global, among cases with observed disability)
		_pctile p_predicted if !missing(${disvar}) [pw=${pweight}], percentiles(${prop_dis})
			scalar pred_cutoff = `r(r1)'
		gen byte ${disvar}_predicted_fxdL = (p_predicted > pred_cutoff + 0.0000001)				// see help precision for why the + 0.0000001 is added
			label var ${disvar}_predicted_fxdL				"Predicted disability with fixed cut-off (just below threshold)"
		* Binary var with threshold just ABOVE observed disability (from the next-lowest value of p_predicted going in reverse order)
		* The data are temporarily collapsed to one row per distinct value of p_predicted; prevvalue is the value in the row before the cut-off
		preserve
			collapse (percent) prev=${disvar} (count) count=${disvar} [pw=${pweight}] if !missing(${disvar}), by(p_predicted)
			gen prevvalue = p_predicted[_n-1] if abs(p_predicted-pred_cutoff)<0.0000001		// see help precision to understand why "if p_predicted==pred_cutoff" doesn't quite work here
			sum prevvalue 
			local n_prevvalue = r(N)
			local mean_prevvalue `r(mean)'
			* Stopping with a clear message if no next-lowest value was found (no row matches the cut-off, or the matching row is the first one)
			if `n_prevvalue'==0 {
				display as error "No value of p_predicted was found just below the cut-off of " pred_cutoff " - the 'just above' threshold cannot be created"
				exit 459
			}
			scalar pred_prevvalue = `mean_prevvalue'
		restore
		gen byte ${disvar}_predicted_fxdH = (p_predicted > (pred_prevvalue + 0.0000001) )		// the 0.0000001 is because of precision errors, even with the syntax above...
			label var ${disvar}_predicted_fxdH				"Predicted disability with fixed cut-off (just above threshold)"
dis _newline(20) "Cutoffs are" _newline(1) "L=" pred_cutoff _newline(1) "H=" pred_prevvalue

	
// Choosing between the two candidate thresholds (just above or just below the cut-off)
// Both checks below compare the means of every variable whose name starts with the disability variable name,
// among cases where self-reported disability is observed
local observed "if !missing(${disvar})"
svy: mean ${disvar}* `observed'
pause Check whether to use the lower or the upper threshold
// The version picked through the whichversion global loses its L or H suffix...
rename ${disvar}_predicted_fxd${whichversion}	${disvar}_predicted_fxd 
svy: mean ${disvar}* `observed'		// A check
pause Check again that this is the correct choice
// ...and whatever still carries a one-character suffix is dropped
drop ${disvar}_predicted_fxd? 

		
		
*__________________________________________________________________________________________________
* 
**# ANALYSIS USING THIS SCALE
* 		note: THIS IS THE SAME AS SUPPLEMENTARY CODE A2, BUT WITH ${disvar} replaced by ${disvar}_predicted_fxd
*		BUT IT IS REPRODUCED HERE FOR CONVENIENCE.
*__________________________________________________________________________________________________

/* 	THE MODELS
		There are two models here - one for disability prevalence, the other for employment
		After each model, we use 'margins' to get the average marginal effects
		We use 'eststo' to store the results of each model - d=disability, e=employment, M=marginal effects, Mr=contrasts of marginal effects
		All three margins commands take the content of their at() option from the global controlsmeans.
		That global is not set in the SET-UP section of this file, so it has to be defined before this point.
*/
* Flagging an empty controlsmeans, because the at() options below are then passed with nothing inside them
if `"${controlsmeans}"'=="" {
	display as error "WARNING: the global controlsmeans is empty - the at() options of the margins commands below are therefore empty"
}
* Model 1: predicted disability by country (plus controls), followed by the margins for each country
eststo d_${disvar}_predicted_fxd		: logit ${disvar}_predicted_fxd  i.${countryvar} 				  ${controls} [pw=${pweight}]
	eststo dM_${disvar}_predicted_fxd 	:		margins i.${countryvar}, 								 at(${controlsmeans}) post
* Model 2: employment on predicted disability interacted with country (plus controls), followed by the margins by disability status over country
eststo e_${disvar}_predicted_fxd		: logit ${empvar}  ib(0).${disvar}_predicted_fxd##i.${countryvar} ${controls} [pw=${pweight}]
	eststo eM_${disvar}_predicted_fxd	:		margins i.${disvar}_predicted_fxd, over(${countryvar})	 at(${controlsmeans}) post
	* The employment model is made the active set of results again before the contrasts are computed (the margins above was run with 'post')
	estimates restore e_${disvar}_predicted_fxd
	eststo eMr_${disvar}_predicted_fxd	:		margins r.${disvar}_predicted_fxd@${countryvar}, 		 at(${controlsmeans}) post
	

/* THE RESULTS TABLE
		Producing nice tables in Stata is tricky. This is one way of producing them.
		The first step is to rename the variables so that they simply show the name of the country.
		(As you will see in the full replication file (5_nonbootstrap.do), this also enables us to match different models in the same row).
		We then use the ESTTAB command to produce the tables - all of these options can be tweaked as needed.
*/
// Renaming the coefficients to simply have the country name in them
// For every country, this loop adds one entry to each of four lists:
//   rename_eMr, rename_eM, rename_dM - pairs of (original coefficient name, country name with underscores) for esttab's rename()
//   labels                           - pairs of (country name with underscores, country name with spaces) for esttab's coeflabels()
levelsof ${countryvar}
foreach country in `r(levels)'	{
	local countryname: label (${countryvar}) `country'
	local countrylab = subinstr("`countryname'", " ", "_", .)		// esttab, rename() doesn't work with spaces, so need to remove this, and use the labels list to insert the space back in
	local rename_eMr `"`rename_eMr' r1vs0.${disvar}_predicted_fxd@`country'.${countryvar} "`countrylab'" "'
	local rename_eM  `"`rename_eM' `country'.${countryvar}#1.${disvar}_predicted_fxd "`countrylab'" "'
	local rename_dM  `"`rename_dM' `country'.${countryvar} "`countrylab'" "'
	local labels     `"`labels' `countrylab' "`countryname'" "'		// this is what puts the spaces back into the country names in the tables
}
// The output options for all three tables
global esttabopts `"nostar b(%3.1f) ci(%3.1f) transform(@*100 100) varwidth(20) modelwidth(8 15)"'
global esttabopts `"${esttabopts} coeflabels(`labels') nonum replace nolegend nobaselevels wide mtitles(LLSI)"'
global esttabopts `"${esttabopts} addnotes("Table created on `c(current_date)'")"'
// Finally, generating the results tables themselves
// (one rtf file each for the contrasts, the employment margins and the disability margins; the rows for the non-disabled are dropped from the second)
esttab eMr_* using "${outputdir}\predicted_empR.rtf", ${esttabopts} rename(`rename_eMr') title("Disability employment gap")
esttab eM_*  using "${outputdir}\predicted_empM.rtf", ${esttabopts} rename(`rename_eM' ) title("Disability employment rate") drop(*0.${disvar}_predicted_fxd*)
esttab dM_*  using "${outputdir}\predicted_disR.rtf", ${esttabopts} rename(`rename_dM' ) title("Disability rate")
